The data consists of vegetation % cover by functional group from across CONUS (from AIM, FIA, LANDFIRE, and RAP), as well as climate variables from DayMet, which have been aggregated into mean interannual conditions accross multiple temporal windows.

Dependencies

User defined parameters

print(params)
## $test_run
## [1] FALSE
## 
## $save_figs
## [1] FALSE
## 
## $hmod
## [1] FALSE
## 
## $sample_group
## [1] 1
## 
## $response
## [1] "HerbCover_dec"
## 
## $s
## [1] "HerbCover"
# set to true if want to run for a limited number of rows (i.e. for code testing)
test_run <- params$test_run
save_figs <- params$save_figs
hmod <- params$hmod # whether to include human modification in the model
# by changing the sample_group the model can be fit to a completely different set of rows
sample_group <- params$sample_group 
response <- params$response
# _ann defines this model as being built on annual data
s <- params$s # string to paste to file names e.g., that defines the interactions in the model
# such as (summer ppt * MAT) and annuals*temperature interactions
fit_sample <- TRUE # fit model to a sample of the data
n_train <- 5e4 # sample size of the training data
n_test <- 1e6 # sample size of the testing data (if this is too big the decile dotplot code throws memory errors)
# set option so resampled dataset created here reproduces earlier runs of this code with dplyr 1.0.10
source("../../../Functions/glmTransformsIterates.R")
source("../../../Functions/transformPreds.R")
source("../../../Functions/StepBeta_mine.R")
#source("src/fig_params.R")
#source("src/modeling_functions.R")

library(caret)
library(betareg)
library(tidyverse)
library(GGally) # for ggpairs()
library(pdp) # for partial dependence plots
library(gridExtra)
library(knitr)
library(patchwork) # for figure insets etc. 
library(ggtext)
library(StepBeta)
theme_set(theme_classic())

read in data

Data compiled in the prepDataForModels.R script

modDat <- readRDS("../../../data/DataForModels.rds")

Prep data

set.seed(1)
modDat_1 <- modDat

# small dataset for if testing the data
if(test_run) {
  modDat_1 <- slice_sample(modDat_1, n = 1e5)
}

For now, not doing any resampling

set.seed(1234)

pred_vars <- c("swe_meanAnnAvg_30yr", "tmean_meanAnnAvg_30yr", "prcp_meanAnnTotal_30yr", "PrecipTempCorr_meanAnnAvg_30yr", "isothermality_meanAnnAvg_30yr", "annWetDegDays_meanAnnAvg_30yr")
# removed VPD, since it's highly correlated w/ tmean and prcp

names(pred_vars) <- pred_vars

# predictor vars are the same in both dfs

## remove rows for data that have NAs in the predictors (lag starts before range of DayMet data)

modDat_1 <- modDat_1 %>% 
  filter(!is.na(swe_meanAnnAvg_30yr) & 
           !is.na(tmin_meanAnnAvg_30yr) & 
           !is.na(prcp_meanAnnTotal_30yr) &
           !is.na(precip_Seasonality_meanAnnAvg_30yr) & 
           !is.na(PrecipTempCorr_meanAnnAvg_30yr) & 
           !is.na(paste(response)) )

df_pred <- modDat_1[, pred_vars]

Training data

df_sample <- if(fit_sample & !test_run) {
  reordered <- slice_sample(modDat_1, prop = 1)
  
  low <- (sample_group - 1)*n_train + 1 # first row (of reordered data) to get
  high <- low + n_train - 1 # last row
  if(high > nrow(modDat_1)) {
    stop('trying to sample from rows that dont exist')
  }
  reordered[low:high, ]
} else {
    modDat_1
}

df_test <- if(fit_sample & !test_run & 
              # antijoin only works if there are enough rows that meet 
              # that criterion, i.e. if df_sample contains most of the data i
              # doesnt' work
              (nrow(modDat_1) - nrow(df_sample) > n_test)) {
  modDat_1 %>% 
    anti_join(df_sample, by = c("cell_num", "year")) %>% 
    slice_sample(n = n_test)
} else {
  modDat_1 %>% 
    slice_sample(n = n_test)
}

# small sample for certain plots
df_small <- slice_sample(modDat_1, n = 1e5)

Exploratory figs & summary values

create_summary <- function(df) {
  df %>% 
    pivot_longer(cols = everything(),
                 names_to = 'variable') %>% 
    group_by(variable) %>% 
    summarise(across(value, .fns = list(mean = ~mean(.x, na.rm = TRUE), min = ~min(.x, na.rm = TRUE), 
                                        median = ~median(.x, na.rm = TRUE), max = ~max(.x, na.rm = TRUE)))) %>% 
    mutate(across(where(is.numeric), round, 4))
}

modDat_1[pred_vars] %>% 
  create_summary() %>% 
  knitr::kable(caption = 'summaries of predictor variables')
summaries of predictor variables
variable value_mean value_min value_median value_max
PrecipTempCorr_meanAnnAvg_30yr -0.1333 -0.8556 -0.1200 0.7145
annWetDegDays_meanAnnAvg_30yr 1656.2857 208.9903 1657.7056 2713.2516
isothermality_meanAnnAvg_30yr -37.1426 -61.3018 -36.8628 -20.2171
prcp_meanAnnTotal_30yr 760.5631 52.1693 538.0903 4116.7860
swe_meanAnnAvg_30yr 16.1970 0.0000 2.7609 1471.5825
tmean_meanAnnAvg_30yr 6.8020 1.7479 6.8581 10.3473
response_summary <- modDat_1 %>% 
    dplyr::select(#where(is.numeric), -all_of(pred_vars),
      matches(response)) %>% 
    create_summary()


kable(response_summary, 
      caption = 'summaries of response variables, calculated using paint')
summaries of response variables, calculated using paint
variable value_mean value_min value_median value_max
HerbCover_dec 0.1615 1e-04 0.055 0.999

Plot predictor vars against each other

here using pred dataframe, because smaller and this code is slow.

df_pred %>% 
  slice_sample(n = 5e4) %>% 
  #select(-matches("_")) %>% 
ggpairs(lower = list(continuous = GGally::wrap("points", alpha = 0.1, size=0.2)), 
        columnLabels = c("swe", "tmean", "prcp", "prcpTempCorr", "isothermality", "wetDegDays"))

boxplots– # predictor variables compared to binned response variables

# vectors of names of response variables
vars_response <- response

# longformat dataframes for making boxplots
df_sample_plots <-  df_sample %>% 
  rename(response = all_of(response)) %>% 
  mutate(response = case_when(
    response <= .25 ~ ".25", 
    response > .25 & response <=.5 ~ ".5", 
    response > .5 & response <=.75 ~ ".75", 
    response >= .75  ~ "1", 
  )) %>% 
  select(c(response, pred_vars)) %>% 
  pivot_longer(cols = names(pred_vars), 
               names_to = "predictor", 
               values_to = "value") %>% 
  filter(!is.na(response) & !is.na(value))
# df_biome_long <- 
#   # for some reason select() was giving me problems
#   # adding numYrs here so can take weighted average
#   predvars2long(modDat_1, response_vars = c(vars_response), 
#                 pred_vars = pred_vars) #%>% 
#    # mutate(across(all_of(vars_nfire), factor))


ggplot(df_sample_plots, aes_string(x= "response", y = 'value')) +
  geom_boxplot() +
  facet_wrap(~predictor, scales = 'free_y') + 
  ylab("Predictor Variable Values")

GLMs

fitting models

fitting various transforms

Creating formulas where each variable on its own is transformed numerous ways (including formula to fit spline), all other variables are left alone, that repeated for each variable. So have models with 1 variable transformed, 2 transformed, etc.

see documentation for glms_iterate_transforms, in the modelling_functions.R script

set.seed(1234)

# adding an interaction term to help deal with over-predicting fire probability
# at pixels with high afgAGB and high MAP. parentheses around interaction
# term are included so that glms_iterate_transforms doesn't transform
# the interaction term. 
pred_vars_inter <- c(pred_vars, params$inter)


# pred_vars_inter <- pred_vars

mods_glm_cwf1 <- glmTransformsIterates(
  preds = pred_vars_inter, 
  df = df_sample,
  response_var = response, 
  delta_aic = 10)
## [1] "step1"
## 
##  -69532.46 
## -70133.59 
## delta aic cutoff 10 
## [1] "step2"
## 
##  -70133.59 
## -70660.77 
## delta aic cutoff 10 
## [1] "step3"
## 
##  -70660.77 
## -71054.39 
## delta aic cutoff 10 
## [1] "step4"
## 
##  -71054.39 
## -71454.99 
## delta aic cutoff 10 
## [1] "step5"
## 
##  -71454.99 
## -71700.17 
## delta aic cutoff 10 
## [1] "step6"
## 
##  -71700.17 
## -71938.53 
## delta aic cutoff 10
# not including the last element of the list which is final_formula
mods_glm_cwf2 <- mods_glm_cwf1[-length(mods_glm_cwf1)]

map_dfc(mods_glm_cwf2, ~ names(.$aic[1:5])) %>% 
  kable(caption = "5 best transformations at each step")
5 best transformations at each step
step1 step2 step3 step4 step5 step6
convert_poly2log10_swe_meanAnnAvg_30yr convert_poly2_annWetDegDays_meanAnnAvg_30yr convert_poly2log10_PrecipTempCorr_meanAnnAvg_30yr convert_poly2_prcp_meanAnnTotal_30yr convert_poly2_isothermality_meanAnnAvg_30yr convert_poly2sqrt_tmean_meanAnnAvg_30yr
convert_poly2_prcp_meanAnnTotal_30yr convert_poly2sqrt_annWetDegDays_meanAnnAvg_30yr convert_poly2_isothermality_meanAnnAvg_30yr convert_poly2sqrt_prcp_meanAnnTotal_30yr convert_poly2sqrt_tmean_meanAnnAvg_30yr convert_poly2_tmean_meanAnnAvg_30yr
convert_poly2log10_PrecipTempCorr_meanAnnAvg_30yr convert_poly2_tmean_meanAnnAvg_30yr convert_log10_PrecipTempCorr_meanAnnAvg_30yr convert_log10_prcp_meanAnnTotal_30yr convert_poly2log10_tmean_meanAnnAvg_30yr convert_poly2log10_tmean_meanAnnAvg_30yr
convert_poly2_isothermality_meanAnnAvg_30yr convert_poly2_isothermality_meanAnnAvg_30yr convert_poly2_tmean_meanAnnAvg_30yr convert_poly2log10_prcp_meanAnnTotal_30yr convert_poly2_tmean_meanAnnAvg_30yr convert_log10_tmean_meanAnnAvg_30yr
convert_log10_PrecipTempCorr_meanAnnAvg_30yr convert_poly2sqrt_tmean_meanAnnAvg_30yr convert_poly2sqrt_tmean_meanAnnAvg_30yr convert_poly2_tmean_meanAnnAvg_30yr convert_log10_tmean_meanAnnAvg_30yr convert_sqrt_tmean_meanAnnAvg_30yr
best_aic_by_step <- map_dbl(mods_glm_cwf2, ~.$aic[1])

best_aic_by_step <- c(step0 = mods_glm_cwf1$step1$aic[['convert_none']], 
                      best_aic_by_step)
# AIC improvement by step
diff(best_aic_by_step)
##     step1     step2     step3     step4     step5     step6 
## -601.1261 -527.1761 -393.6207 -400.6029 -245.1797 -238.3640
plot(y = best_aic_by_step, 
     x = 1:length(best_aic_by_step) - 1,
     ylab = "AIC",
     xlab = "Number of variables transformed")

Best transformation each step.

var_transformed <- map(mods_glm_cwf2, function(x) x$var_transformed)
var_transformed
## $step1
## [1] "stats::poly(I(log10(I(swe_meanAnnAvg_30yr+1))),2,raw=TRUE)"
## 
## $step2
## [1] "stats::poly(annWetDegDays_meanAnnAvg_30yr,2,raw=TRUE)"
## 
## $step3
## [1] "stats::poly(I(log10(I(PrecipTempCorr_meanAnnAvg_30yr+1))),2,raw=TRUE)"
## 
## $step4
## [1] "stats::poly(prcp_meanAnnTotal_30yr,2,raw=TRUE)"
## 
## $step5
## [1] "stats::poly(isothermality_meanAnnAvg_30yr,2,raw=TRUE)"
## 
## $step6
## [1] "stats::poly(I(sqrt(tmean_meanAnnAvg_30yr)),2,raw=TRUE)"

adding interaction terms

Plot potential interaction terms using raw data

across values of swe

# calculate quantiles
swe_quants <- quantile(df_sample$swe_meanAnnAvg_30yr, na.rm = TRUE)
# longformat dataframe for making boxplots
df_sample_swe <-  df_sample %>% 
  select(c(response, pred_vars)) %>% 
  # transform the predictors according to above process
  mutate(isotherm_squared = as.numeric(stats::poly(isothermality_meanAnnAvg_30yr,2,raw=TRUE)[,2]),
         wetDegDays_squared = as.numeric(stats::poly(annWetDegDays_meanAnnAvg_30yr,2,raw=TRUE)[,2]),
         swe_log = as.numeric(stats::poly(I(log10(I(swe_meanAnnAvg_30yr+1))),2,raw=TRUE)[,1]),
         swe_log_squared = as.numeric(stats::poly(I(log10(I(swe_meanAnnAvg_30yr+1))),2,raw=TRUE)[,2]),
         tmean_log =            as.numeric(stats::poly(I(log10(I(tmean_meanAnnAvg_30yr+1))),2,raw=TRUE)[,1] ),
         tmean_log_squared =    as.numeric(stats::poly(I(log10(I(tmean_meanAnnAvg_30yr+1))),2,raw=TRUE)[,2] ),
         prcp_log =             as.numeric(stats::poly(I(log10(I(prcp_meanAnnTotal_30yr+1))),2,raw=TRUE)[,1]),
         prcp_log_squared =     as.numeric(stats::poly(I(log10(I(prcp_meanAnnTotal_30yr+1))),2,raw=TRUE)[,2]),
         prcpTempCorr_squared = as.numeric(stats::poly(PrecipTempCorr_meanAnnAvg_30yr,2,raw=TRUE)[,2])
         ) %>% 
  rename(response = all_of(response)) %>% 
  # mutate(response = case_when(
  #   response <= .25 ~ ".25", 
  #   response > .25 & response <=.5 ~ ".5", 
  #   response > .5 & response <=.75 ~ ".75", 
  #   response >= .75  ~ "1", 
  # )) %>% 
  # pivot_longer(cols = 2:16, 
  #              names_to = "predictor", 
  #              values_to = "value") %>% 
  # filter(!is.na(response) & !is.na(value))
  mutate(swe_meanAnnAvg_30yr_quants = case_when(
    swe_meanAnnAvg_30yr <= swe_quants[2] ~ swe_quants[2], 
    swe_meanAnnAvg_30yr > swe_quants[2]  & swe_meanAnnAvg_30yr <= swe_quants[3]  ~ swe_quants[3], 
    swe_meanAnnAvg_30yr > swe_quants[3]  & swe_meanAnnAvg_30yr <= swe_quants[4]  ~ swe_quants[4], 
    swe_meanAnnAvg_30yr >= swe_quants[4]  ~ swe_quants[5], 
  )) %>% 
  pivot_longer(cols = 2:16, 
               names_to = "predictor", 
               values_to = "value") %>% 
  filter(!is.na(response) & !is.na(value))

# plot relationships amongst variables according to different levels of SWE 
ggplot(data = df_sample_swe[!(df_sample_swe$predictor %in% c("swe_log", "swe_log_squared", "swe_meanAnnAvg_30yr")),]) + 
  geom_point(aes(y = response, x = value, col = as.factor(swe_meanAnnAvg_30yr_quants)), alpha = .5) +
  geom_smooth(aes(y = response, x = value,  col = as.factor(swe_meanAnnAvg_30yr_quants)), method = "glm") + 
  facet_wrap(~predictor, scales = "free_x")

Mean Annual Temperature

tmean_quants <- quantile(df_sample$tmean_meanAnnAvg_30yr, na.rm = TRUE)

# longformat dataframe for making boxplots
df_sample_tmean <-  df_sample %>% 
  select(c(response, pred_vars)) %>% 
  # transform the predictors according to above process
  mutate(isotherm_squared = as.numeric(stats::poly(isothermality_meanAnnAvg_30yr,2,raw=TRUE)[,2]),
         wetDegDays_squared = as.numeric(stats::poly(annWetDegDays_meanAnnAvg_30yr,2,raw=TRUE)[,2]),
         swe_log = as.numeric(stats::poly(I(log10(I(swe_meanAnnAvg_30yr+1))),2,raw=TRUE)[,1]),
         swe_log_squared = as.numeric(stats::poly(I(log10(I(swe_meanAnnAvg_30yr+1))),2,raw=TRUE)[,2]),
         tmean_log =            as.numeric(stats::poly(I(log10(I(tmean_meanAnnAvg_30yr+1))),2,raw=TRUE)[,1] ),
         tmean_log_squared =    as.numeric(stats::poly(I(log10(I(tmean_meanAnnAvg_30yr+1))),2,raw=TRUE)[,2] ),
         prcp_log =             as.numeric(stats::poly(I(log10(I(prcp_meanAnnTotal_30yr+1))),2,raw=TRUE)[,1]),
         prcp_log_squared =     as.numeric(stats::poly(I(log10(I(prcp_meanAnnTotal_30yr+1))),2,raw=TRUE)[,2]),
         prcpTempCorr_squared = as.numeric(stats::poly(PrecipTempCorr_meanAnnAvg_30yr,2,raw=TRUE)[,2])
         ) %>% 
  rename(response = all_of(response)) %>% 
  # mutate(response = case_when(
  #   response <= .25 ~ ".25", 
  #   response > .25 & response <=.5 ~ ".5", 
  #   response > .5 & response <=.75 ~ ".75", 
  #   response >= .75  ~ "1", 
  # )) %>% 
  # pivot_longer(cols = 2:16, 
  #              names_to = "predictor", 
  #              values_to = "value") %>% 
  # filter(!is.na(response) & !is.na(value))
  mutate(tmean_meanAnnAvg_30yr_quants = case_when(
    tmean_meanAnnAvg_30yr <= tmean_quants[2] ~ tmean_quants[2], 
    tmean_meanAnnAvg_30yr > tmean_quants[2]  & tmean_meanAnnAvg_30yr <= tmean_quants[3]  ~ tmean_quants[3], 
    tmean_meanAnnAvg_30yr > tmean_quants[3]  & tmean_meanAnnAvg_30yr <= tmean_quants[4]  ~ tmean_quants[4], 
    tmean_meanAnnAvg_30yr >= tmean_quants[4]  ~ tmean_quants[5], 
  )) %>% 
  pivot_longer(cols = 2:16, 
               names_to = "predictor", 
               values_to = "value") %>% 
  filter(!is.na(response) & !is.na(value))

# plot relationships amongst variables according to different levels of tmean 
ggplot(data = df_sample_tmean[!(df_sample_tmean$predictor %in% c("tmean_log", "tmean_log_squared", "tmean_meanAnnAvg_30yr")),]) + 
  geom_point(aes(y = response, x = value, col = as.factor(tmean_meanAnnAvg_30yr_quants)), alpha = .3) +
  geom_smooth(aes(y = response, x = value,  col = as.factor(tmean_meanAnnAvg_30yr_quants)), method = "glm") + 
  facet_wrap(~predictor, scales = "free_x")

Mean annual precipitation

prcp_quants <- quantile(df_sample$prcp_meanAnnTotal_30yr, na.rm = TRUE)

# longformat dataframe for making boxplots
df_sample_prcp <-  df_sample %>% 
  select(c(response, pred_vars)) %>% 
  # transform the predictors according to above process
  mutate(isotherm_squared = as.numeric(stats::poly(isothermality_meanAnnAvg_30yr,2,raw=TRUE)[,2]),
         wetDegDays_squared = as.numeric(stats::poly(annWetDegDays_meanAnnAvg_30yr,2,raw=TRUE)[,2]),
         swe_log = as.numeric(stats::poly(I(log10(I(swe_meanAnnAvg_30yr+1))),2,raw=TRUE)[,1]),
         swe_log_squared = as.numeric(stats::poly(I(log10(I(swe_meanAnnAvg_30yr+1))),2,raw=TRUE)[,2]),
         tmean_log =            as.numeric(stats::poly(I(log10(I(tmean_meanAnnAvg_30yr+1))),2,raw=TRUE)[,1] ),
         tmean_log_squared =    as.numeric(stats::poly(I(log10(I(tmean_meanAnnAvg_30yr+1))),2,raw=TRUE)[,2] ),
         prcp_log =             as.numeric(stats::poly(I(log10(I(prcp_meanAnnTotal_30yr+1))),2,raw=TRUE)[,1]),
         prcp_log_squared =     as.numeric(stats::poly(I(log10(I(prcp_meanAnnTotal_30yr+1))),2,raw=TRUE)[,2]),
         prcpTempCorr_squared = as.numeric(stats::poly(PrecipTempCorr_meanAnnAvg_30yr,2,raw=TRUE)[,2])
         ) %>% 
  rename(response = all_of(response)) %>% 
  # mutate(response = case_when(
  #   response <= .25 ~ ".25", 
  #   response > .25 & response <=.5 ~ ".5", 
  #   response > .5 & response <=.75 ~ ".75", 
  #   response >= .75  ~ "1", 
  # )) %>% 
  # pivot_longer(cols = 2:16, 
  #              names_to = "predictor", 
  #              values_to = "value") %>% 
  # filter(!is.na(response) & !is.na(value))
  mutate(prcp_meanAnnTotal_30yr_quants = case_when(
    prcp_meanAnnTotal_30yr <= prcp_quants[2] ~ prcp_quants[2], 
    prcp_meanAnnTotal_30yr > prcp_quants[2]  & prcp_meanAnnTotal_30yr <= prcp_quants[3]  ~ prcp_quants[3], 
    prcp_meanAnnTotal_30yr > prcp_quants[3]  & prcp_meanAnnTotal_30yr <= prcp_quants[4]  ~ prcp_quants[4], 
    prcp_meanAnnTotal_30yr >= prcp_quants[4]  ~ prcp_quants[5], 
  )) %>% 
  pivot_longer(cols = 2:16, 
               names_to = "predictor", 
               values_to = "value") %>% 
  filter(!is.na(response) & !is.na(value))

# plot relationships amongst variables according to different levels of tmean 
ggplot(data = df_sample_prcp[!(df_sample_prcp$predictor %in% c("prcp_log", "prcp_log_squared", "prcp_meanAnnTotal_30yr")),]) + 
  geom_point(aes(y = response, x = value, col = as.factor(prcp_meanAnnTotal_30yr_quants)), alpha = .3) +
  geom_smooth(aes(y = response, x = value,  col = as.factor(prcp_meanAnnTotal_30yr_quants)), method = "glm") + 
  facet_wrap(~predictor, scales = "free_x")

we’ll try running the model w/ interactions for swe:isothermality, swe:tmean, tmean:precip, and prcp:swe

best_form <- mods_glm_cwf1$final_formula
print(best_form)
##                                                                                                                                                                                                                                                                                                                                           convert_poly2sqrt_tmean_meanAnnAvg_30yr 
## "HerbCover_dec ~ stats::poly(I(log10(I(swe_meanAnnAvg_30yr+1))),2,raw=TRUE) + stats::poly(I(sqrt(tmean_meanAnnAvg_30yr)),2, raw = TRUE) + stats::poly(prcp_meanAnnTotal_30yr,2,raw=TRUE) + stats::poly(I(log10(I(PrecipTempCorr_meanAnnAvg_30yr+1))),2,raw=TRUE) + stats::poly(isothermality_meanAnnAvg_30yr,2,raw=TRUE) + stats::poly(annWetDegDays_meanAnnAvg_30yr,2,raw=TRUE)"
# formula w/ interactions included
modWithInteractions <- paste(best_form, c("+ I(log10(I(swe_meanAnnAvg_30yr+1))):isothermality_meanAnnAvg_30yr + I(log10(I(swe_meanAnnAvg_30yr+1))):I(log10(I(tmean_meanAnnAvg_30yr+1))) + I(log10(I(tmean_meanAnnAvg_30yr+1))):I(log10(I(prcp_meanAnnTotal_30yr+1))) + I(log10(I(prcp_meanAnnTotal_30yr+1))):I(log10(I(swe_meanAnnAvg_30yr+1)))")) %>% 
  str_remove_all("stats::")

# refitting the glm with the best formula
mod_glm1 <- betareg(as.formula(modWithInteractions), data = df_sample, link = c("logit"), link.phi = NULL,
                              type = c("ML"))

performing backwards AIC-based model selection

Using the “StepBeta” function from the StepBeta R package

modSelection <- StepBeta_mine(mod_glm1, data = df_sample)
## [1] "100 % of the process"

same model for all response variables

Response variables are the proportion of years in which fires occurred. Using best model formula selected in the previous step

best_form <- modSelection$formula
print(best_form)
## HerbCover_dec ~ poly(I(sqrt(tmean_meanAnnAvg_30yr)), 2, raw = TRUE) + 
##     poly(I(log10(I(PrecipTempCorr_meanAnnAvg_30yr + 1))), 2, 
##         raw = TRUE) + poly(I(log10(I(swe_meanAnnAvg_30yr + 1))), 
##     2, raw = TRUE) + poly(annWetDegDays_meanAnnAvg_30yr, 2, raw = TRUE) + 
##     poly(prcp_meanAnnTotal_30yr, 2, raw = TRUE) + poly(isothermality_meanAnnAvg_30yr, 
##     2, raw = TRUE) + I(log10(I(swe_meanAnnAvg_30yr + 1))):I(log10(I(tmean_meanAnnAvg_30yr + 
##     1))) + I(log10(I(swe_meanAnnAvg_30yr + 1))):I(log10(I(prcp_meanAnnTotal_30yr + 
##     1))) + I(log10(I(swe_meanAnnAvg_30yr + 1))):isothermality_meanAnnAvg_30yr + 
##     I(log10(I(tmean_meanAnnAvg_30yr + 1))):I(log10(I(prcp_meanAnnTotal_30yr + 
##         1)))
## <environment: 0x644a4ec88>
# refitting the glm with the best formula
mod_glmFinal <- betareg(as.formula(best_form), data = df_sample, link = c("logit"), link.phi = NULL,
                              type = c("ML"))
# fit using glmmTMB for subsequent analyses (Same output)
#mod_glmFinal_glmmTMB <- glmmTMB::glmmTMB(as.formula(best_form), data = df_sample, family=beta_family(link="logit"))

# should be the same AIC (i.e. refitting the same model)
AIC(mod_glmFinal)
## [1] -73864.09
#summary(mod_glmFinal_glmmTMB)

partial dependence & VIP

PDP plot trend made using a small sample of the data

#vip::vip(mod_glmFinal, num_features = 15)

#pdp_all_vars(mod_glmFinal, mod_vars = pred_vars, ylab = 'probability',train = df_small)

#caret::varImp(mod_glmFinal)

observed vs. predicted

Predicting on the data

# create prediction for each each model
# (i.e. for each fire proporation variable)
predict_by_response <- function(mod, df) {
  df_out <- df

  response_name <- paste0(response, "_pred")
  df_out[[response_name]] <- predict(mod, df, type = 'response')
  df_out
}

pred_glm1 <- predict_by_response(mod_glmFinal, df_test)

Deciles

Binning predictor variables into deciles (actually percentiles) and looking at the mean predicted probability for each percentile. The use of the word deciles is just a legacy thing (they started out being actual deciles)

Then predicting on an identical dataset but with warming

var_prop_pred <- paste0(response, "_pred")
response_vars <- c(response, var_prop_pred)

pred_glm1_deciles <- predvars2deciles(pred_glm1,
                                      response_vars = response_vars,
                                      pred_vars = pred_vars)

Publication quality quantile plot

# publication quality version
g <- decile_dotplot_pq(pred_glm1_deciles, response = response)

if(!hmod) {
# obs/pred inset
g2 <- add_dotplot_inset(g, pred_glm1_deciles)
} else {
  g2 <- g
}
g2

if(save_figs) {
  png(paste0("figures/quantile_plots/quantile_plot_v5", s,  ".png"), 
     units = "in", res = 600, width = 5.5, height = 3.5 )
    print(g2)
  dev.off()
}

Deciles Filtered

20th and 80th percentiles for each climate variable

df <- pred_glm1[, pred_vars] #%>% 
  #mutate(MAT = MAT - 273.15) # k to c
map(df, quantile, probs = c(0.2, 0.8), na.rm = TRUE)
## $swe_meanAnnAvg_30yr
##         20%         80% 
##  0.03122831 16.79101381 
## 
## $tmean_meanAnnAvg_30yr
##      20%      80% 
## 5.852257 7.806215 
## 
## $prcp_meanAnnTotal_30yr
##       20%       80% 
##  324.5293 1261.6810 
## 
## $PrecipTempCorr_meanAnnAvg_30yr
##        20%        80% 
## -0.5890414  0.2725700 
## 
## $isothermality_meanAnnAvg_30yr
##       20%       80% 
## -41.37623 -33.05065 
## 
## $annWetDegDays_meanAnnAvg_30yr
##      20%      80% 
## 1345.267 2034.182

Filtered ‘Decile’ plots of data. These plots show each vegetation variable, but only based on data that falls into the upper and lower two deciles of each climate variable.

clim_vars <- c("swe_meanAnnAvg_30yr", "tmean_meanAnnAvg_30yr", "prcp_meanAnnTotal_30yr", "precip_Seasonality_meanAnnAvg_30yr", "isothermality_meanAnnAvg_30yr", "annWetDegDays_meanAnnAvg_30yr")
pred_glm1_deciles_filt <- predvars2deciles( pred_glm1, 
                         response_vars = response_vars,
                         pred_vars = pred_vars,
                         filter_var = TRUE,
                         filter_vars = pred_vars) 

decile_dotplot_filtered_pq(pred_glm1_deciles_filt, xvars = clim_vars)

#decile_dotplot_filtered_pq(pred_glm1_deciles_filt)

Filtered quantile figure with middle 2 deciles also shown (this is very memory intensive so no running at the moment)

pred_glm1_deciles_filt_mid <- predvars2deciles(pred_glm1, 
                         response_vars = response_vars,
                         pred_vars = pred_vars,
                         filter_vars = pred_vars,
                         filter_var = TRUE,
                         add_mid = TRUE)

g <- decile_dotplot_filtered_pq(df = pred_glm1_deciles_filt_mid, xvars = pred_vars)
g

if(save_figs) {
jpeg(paste0("figures/quantile_plots/quantile_plot_filtered_mid_v1", s, ".jpeg"),
     units = "in", res = 600, width = 5.5, height = 6 )
  g 
dev.off()
}

Save output

# glm models
mods2save <- butcher::butcher(mod_glmFinal) # removes some model components so the saved object isn't huge

mods2save$formula <- best_form
mods2save$pred_vars_inter <- pred_vars_inter # so have interactions
n <- nrow(df_sample)
mods2save$data_rows <- n

if(!test_run) {
  saveRDS(mods2save, 
        paste0("./models/glm_beta_model_FirstPass_", s, "_n", n, 
        #sample_group, 
        ".RDS"))
}

session info

Hash of current commit (i.e. to ID the version of the code used)

system("git rev-parse HEAD", intern=TRUE)
## [1] "12f42926c3fdbe0f575a7492a3aee5378726ee9f"

Packages etc.

sessionInfo()
## R version 4.4.0 (2024-04-24)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sonoma 14.4.1
## 
## Matrix products: default
## BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Denver
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] caret_6.0-94    lattice_0.22-6  ggtext_0.1.2    knitr_1.46      gridExtra_2.3   pdp_0.8.1      
##  [7] GGally_2.2.1    dtplyr_1.3.1    patchwork_1.2.0 lmtest_0.9-40   zoo_1.8-12      StepBeta_2.1.0 
## [13] sf_1.0-16       corrplot_0.92   glmmTMB_1.1.9   betareg_3.1-4   statmod_1.5.0   terra_1.7-78   
## [19] lubridate_1.9.3 forcats_1.0.0   stringr_1.5.1   dplyr_1.1.4     purrr_1.0.2     readr_2.1.5    
## [25] tidyr_1.3.1     tibble_3.2.1    ggplot2_3.5.1   tidyverse_2.0.0
## 
## loaded via a namespace (and not attached):
##   [1] RColorBrewer_1.1-3   jsonlite_1.8.8       rstudioapi_0.16.0    magrittr_2.0.3      
##   [5] modeltools_0.2-23    farver_2.1.2         nloptr_2.0.3         rmarkdown_2.27      
##   [9] vctrs_0.6.5          minqa_1.2.7          butcher_0.3.4        htmltools_0.5.8.1   
##  [13] progress_1.2.3       Formula_1.2-5        pROC_1.18.5          sass_0.4.9          
##  [17] parallelly_1.37.1    bslib_0.7.0          KernSmooth_2.23-22   plyr_1.8.9          
##  [21] sandwich_3.1-0       cachem_1.1.0         TMB_1.9.12           commonmark_1.9.1    
##  [25] lifecycle_1.0.4      iterators_1.0.14     pkgconfig_2.0.3      Matrix_1.7-0        
##  [29] R6_2.5.1             fastmap_1.2.0        future_1.33.2        digest_0.6.35       
##  [33] numDeriv_2016.8-1.1  colorspace_2.1-0     pkgload_1.3.4        labeling_0.4.3      
##  [37] fansi_1.0.6          timechange_0.3.0     abind_1.4-5          mgcv_1.9-1          
##  [41] compiler_4.4.0       proxy_0.4-27         aod_1.3.3            withr_3.0.0         
##  [45] carData_3.0-5        DBI_1.2.3            ggstats_0.6.0        highr_0.10          
##  [49] MASS_7.3-60.2        lava_1.8.0           classInt_0.4-10      ModelMetrics_1.2.2.2
##  [53] tools_4.4.0          units_0.8-5          future.apply_1.11.2  nnet_7.3-19         
##  [57] glue_1.7.0           nlme_3.1-164         gridtext_0.1.5       grid_4.4.0          
##  [61] reshape2_1.4.4       generics_0.1.3       recipes_1.1.0        gtable_0.3.5        
##  [65] tzdb_0.4.0           class_7.3-22         data.table_1.15.4    hms_1.1.3           
##  [69] xml2_1.3.6           car_3.1-2            utf8_1.2.4           flexmix_2.3-19      
##  [73] foreach_1.5.2        pillar_1.9.0         markdown_1.13        splines_4.4.0       
##  [77] survival_3.5-8       tidyselect_1.2.1     stats4_4.4.0         xfun_0.44           
##  [81] hardhat_1.4.0        timeDate_4032.109    stringi_1.8.4        yaml_2.3.8          
##  [85] boot_1.3-30          evaluate_0.23        codetools_0.2-20     vip_0.4.1           
##  [89] cli_3.6.2            rpart_4.1.23         munsell_0.5.1        jquerylib_0.1.4     
##  [93] Rcpp_1.0.12          globals_0.16.3       parallel_4.4.0       gower_1.0.1         
##  [97] prettyunits_1.2.0    lme4_1.1-35.3        listenv_0.9.1        ipred_0.9-15        
## [101] scales_1.3.0         prodlim_2024.06.25   e1071_1.7-14         crayon_1.5.2        
## [105] combinat_0.0-8       rlang_1.1.4          cowplot_1.1.3